Sample name: SRX1025890_TC32_NT_DRIP.hg38

Sample type: DRIP-Seq

Genome: hg38

Time finished: Thu Jun 10 18:51:30 2021

Quality Report


Genome feature enrichment

R-loop broad peaks were called with macs2 and then compared with genomic features using assignGenomeAnnotation from homer.

anno_cats <- data.frame(
  "Annotation" = c('Intergenic', 'Simple_repeat', 'Satellite', 'Promoter', 'pseudo', 'Intron', 'TTS', 'LINE', 'LTR', 'SINE', 'DNA', 'CpG-Island', 'ncRNA', 'Low_complexity', 'Exon', 'snRNA', 'Retroposon', '5UTR', '3UTR', 'srpRNA', 'tRNA', 'RC', 'scRNA', 'miRNA', 'RNA', 'snoRNA'),
  "annotate_type" = c("gene", "rep", "rep", "gene", "RNA", "gene", "gene", 
             "rep", "rep", "rep", "rep", "gene", "RNA", "rep",
             "gene", "RNA", "rep", "gene", "gene", "RNA",
             "RNA", "RNA", "RNA", "RNA", "RNA", "RNA"), stringsAsFactors = FALSE
)


anno_data_plt <- anno_data %>%
  dplyr::inner_join(y = anno_cats, by = "Annotation")

minplt <- min(anno_data_plt$`Log2 Ratio (obs/exp)`) * 1.05
maxplt <- max(anno_data_plt$`Log2 Ratio (obs/exp)`) * 1.05

# TODO: Add average to this with SEM
plt1 <- anno_data_plt %>%
  dplyr::filter(annotate_type == "gene") %>%
  ggplot2::ggplot(mapping = ggplot2::aes(x = factor(.data$Annotation, 
                                                    levels = c("CpG-Island",
                                                         "Promoter",
                                                         "5UTR",
                                                         "Exon",
                                                         "Intron",
                                                         "3UTR",
                                                         "TTS",
                                                         "Intergenic")), 
                                         y = .data$`Log2 Ratio (obs/exp)`)) +
  ggplot2::geom_col(fill = "firebrick") +
  ggplot2::ylab("Log2 Enrichment (obs/exp)") +
  ggplot2::xlab(NULL) +
  ggplot2::labs(title = "Genomic Features") +
  ggplot2::theme_bw() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 0.5,
                                                     hjust = 1)) +
  ggplot2::ylim(minplt, maxplt)

plt2 <- anno_data_plt %>%
  dplyr::filter(annotate_type == "RNA") %>%
  ggplot2::ggplot(mapping = ggplot2::aes(x = .data$Annotation,
                       y = .data$`Log2 Ratio (obs/exp)`)) +
  ggplot2::geom_col(fill = "goldenrod") +
  ggplot2::ylab(NULL) +
  ggplot2::xlab("Annotation") +
  ggplot2::labs(title = "ncRNAs") +
  ggplot2::theme_bw() + 
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 0.5,
                                                     hjust = 1)) +
  ggplot2::ylim(minplt, maxplt)

plt3 <- anno_data_plt %>%
  dplyr::filter(annotate_type == "rep") %>%
  ggplot2::ggplot(mapping = ggplot2::aes(x = .data$Annotation, 
                                y = .data$`Log2 Ratio (obs/exp)`)) +
  ggplot2::geom_col(fill = "forestgreen") +
  ggplot2::ylab(NULL) +
  ggplot2::xlab(NULL) +
  ggplot2::labs(title = "Repetitive Elements") +
  ggplot2::theme_bw() +
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, vjust = 0.5,
                                                     hjust = 1)) +
  ggplot2::ylim(minplt, maxplt)

gt1 <- ggplot2::ggplotGrob(plt1)
gt2 <- ggplot2::ggplotGrob(plt2)
gt3 <- ggplot2::ggplotGrob(plt3)
if (length(plt2$data$Annotation) && length(plt3$data$Annotation)) {
  plts <- cbind(gt1, gt2, gt3, size = "last")
} else if (length(plt2$data$Annotation)) {
  plts <- cbind(gt1, gt2, size = "last")
} else {
  plts <- gt1
}
grid::grid.newpage()
grid::grid.draw(plts)


R-Loop Forming Sequences (RLFS) Enrichment

RLFS were derived across the genome using QmRLFS-finder.py. R-loop broad peaks were called with macs2 and then compared with RLFS using permTest from the RegioneR R package. An empirical distribution of RLFS was generated using the circularRandomizeRegions method and compared to the peaks in order to calculate enrichment p value and zscore (effect size of enrichment).

pt <- rlfs_data[[1]]
pval <- pt[[1]]$pval
ntimes <- pt[[1]]$ntimes
zscore <- pt[[1]]$zscore

rlfs_pval_color <- ifelse(pval > .05, 'red', ifelse(pval > .01, 'orange', 'green'))
rlfs_zs_color <- ifelse(zscore < 5, 'red', ifelse(zscore < 15, 'orange', 'green'))

lz <- rlfs_data[[2]]

From this analysis, the empirically-determined p value was 0.001996 (with 500 permutations, the minimum possible p value was 0.001996). The enrichment z-score was 26.2923.

plot(pt)

to_plot <- data.frame("zscore" = lz$`regioneR::numOverlaps`$shifted.z.scores,
                      "shift" = lz$`regioneR::numOverlaps`$shifts)
plt <- ggplot2::ggplot(to_plot, ggplot2::aes(y = .data$zscore,
                                             x = .data$shift)) +
  ggplot2::geom_line() +
  ggplot2::labs(title = "Peak enrichment around RLFS") +
  ggplot2::ylab("Peak Enrichment (Z-Score)") +
  ggplot2::xlab("Distance to RLFS (bp)") +
  ggplot2::theme_bw(base_size = 15)
plotly::ggplotly(plt)

Correlation analysis

Using the method described in Chedin et al. 2020, the correlation between SRX1025890_TC32_NT_DRIP.hg38 and the other samples in RMapDB was calculated. The resulting correlation matrix is available in the QC folder.

anno_now <- corr_data$annoNow
sample_name <- configlist$sample_name
anno_now$correlations <- corr_data$corMat[,sample_name]
anno_now$sample_name <- rownames(anno_now)
anno_now <- anno_now[order(anno_now$correlations, decreasing = FALSE), ]
anno_now$rank <- seq(anno_now$sample_name)
plt <- ggplot2::ggplot(anno_now,
                       mapping = ggplot2::aes(x = .data$rank, 
                                              label = .data$sample_name,
                                              y = .data$correlations, 
                                              color = .data$Mode)) +
  ggplot2::geom_point() +
  ggplot2::ylab("Correlation (R)") +
  ggplot2::xlab(NULL) +
  ggplot2::labs(title = paste0(sample_name, " correlation with RMapDB")) +
  ggplot2::theme_bw()
plotly::ggplotly(p = plt)

Correlation heatmap (full-size image in QC folder)

annoNow <- corr_data$annoNow 
corMat <- corr_data$corMat
# Make static correlation picture
# From: https://stackoverflow.com/questions/31677923/set-0-point-for-pheatmap-in-r
paletteLength <- 100
myColor <- colorRampPalette(rev(
  RColorBrewer::brewer.pal(n = 7, name = "RdBu")))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(min(corMat), 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(max(corMat)/paletteLength, max(corMat), length.out=floor(paletteLength/2)))
pheatmap::pheatmap(corMat, show_rownames = FALSE,
                   show_colnames = FALSE,
                   annotation_colors = list(
                     Source = c("RMapDB" = "grey", "User-supplied" = "firebrick")
                   ), 
                   color = myColor, breaks = myBreaks,
                   annotation_col = annoNow)


Peak calling

# TODO: What should this number be? Can we arrive at a more robust estimate?
total_peaks_color <- ifelse(total_peaks < 3000, 'red', ifelse(total_peaks < 6000, 'orange', 'green'))

The peak caller macs was used for peak calling. Number of peaks called: 2836


Mapping quality

reads_aligned <- bam_stats$reads_aligned / 1e6
duplicate_reads <- bam_stats$duplicate_reads / 1e6

if (show_readqc) {
  total_reads <- bam_stats$total_reads / 1e6
  total_reads_color <- "black"
  pct_aligned <- round(100*(reads_aligned/total_reads), digits = 2)
  pct_align_color <- ifelse(pct_aligned > 80, "green", ifelse(pct_aligned > 60, "orange", "red"))
  pct_duplicated <- round(100*(duplicate_reads/total_reads), digits = 2)
  pct_duplicated_color <- ifelse(pct_duplicated < 35, "green", ifelse(pct_aligned < 50, "orange", "red"))
} else {
  total_reads <- "N/A"
  total_reads_color <- "grey"
  pct_aligned <- "N/A"
  pct_align_color <- "grey"
  pct_duplicated <- "N/A"
  pct_duplicated_color <- "grey"
}

genome <- configlist$genome

Reads were aligned to hg38 with bwa mem.

Total reads (millions): N/A

Aligned reads (millions): 0.983307

Percent of reads aligned: N/A

Duplicated reads (millions): 0.022535

Percent of reads duplicated: N/A


 

RSeq © 2021, Bishop Lab, UT Health San Antonio

RSeq maintainer: